Show the code
import pandas as pd
import polars as pl
import numpy as np
import sqlite3
from lets_plot import *
LetsPlot.setup_html(isolated_frame=True)import pandas as pd
import polars as pl
import numpy as np
import sqlite3
from lets_plot import *
LetsPlot.setup_html(isolated_frame=True)# Learn morea about Code Cells: https://quarto.org/docs/reference/cells/cells-jupyter.html
# Include and execute your code here
sqlite_file = 'lahmansbaseballdb.sqlite'
# this file must be in the same location as your .qmd or .py file
con = sqlite3.connect(sqlite_file)Write an SQL query to create a new dataframe about baseball players who attended BYU-Idaho. The new table should contain five columns: playerID, schoolID, salary, and the yearID/teamID associated with each salary. Order the table by salary (highest to lowest) and print out the table in your report.
There were two BUY-I players that however only two actually ended up on a team, but the highest had a salary of 4 million.
# Include and execute your code here
q1 = '''
SELECT cp.playerID, cp.schoolID, cp.yearID, s.teamID, MAX(s.salary) as "Salary"
FROM collegeplaying as cp
LEFT JOIN salaries as s
ON cp.playerID = s.playerID
WHERE cp.schoolID == "idbyuid"
GROUP BY cp.playerID
ORDER BY s.salary DESC
-- LIMIT 5
'''
# Using Pandas
res_schl = pd.read_sql_query(q1, con)
display(res_schl)
# Using Polars
print(f"Using Polars")
res_schl_pl = pl.read_database(q1,con)
display(res_schl_pl)| playerID | schoolID | yearID | teamID | Salary | |
|---|---|---|---|---|---|
| 0 | lindsma01 | idbyuid | 2001 | CHA | 4000000.0 |
| 1 | stephga01 | idbyuid | 1991 | SLN | 1025000.0 |
| 2 | catetr01 | idbyuid | 2002 | None | NaN |
Using Polars
| playerID | schoolID | yearID | teamID | Salary |
|---|---|---|---|---|
| str | str | i64 | str | f64 |
| "lindsma01" | "idbyuid" | 2001 | "CHA" | 4e6 |
| "stephga01" | "idbyuid" | 1991 | "SLN" | 1.025e6 |
| "catetr01" | "idbyuid" | 2002 | null | null |
Pick any two baseball teams and compare them using a metric of your choice (average salary, home runs, number of wins, etc). Be creative! Write an SQL query to get the data you need, then make a graph using Lets-Plot to visualize the comparison. What do you learn?
A battle of the Giants shows that the New York Giants do win more often than the San Francisco Giants do. However, the San Francisco Giants have far more homeruns. Regardless of the homeruns a team makes it is the wins that ultimately count.
# Include and execute your code here
q2 = '''
SELECT name, SUM(W) as "Wins", SUM(L) as "Losses", SUM(G) as "Games", ROUND((1.0 * SUM(W) / SUM(G)), 3) as "Win Rate", SUM(HR) as "Homeruns", ROUND(AVG(attendance),0) as "Average Attendance"
FROM teams
WHERE name == "New York Giants" OR name == "San Francisco Giants"
GROUP BY name
'''
# Using Pandas
res_team = pd.read_sql_query(q2, con)
display(res_team)
# Using Polars
print(f"Using Polars")
res_team_pl = pl.read_database(q2,con)
display(res_team_pl)| name | Wins | Losses | Games | Win Rate | Homeruns | Average Attendance | |
|---|---|---|---|---|---|---|---|
| 0 | New York Giants | 6033 | 4855 | 11033 | 0.547 | 5795 | 676358.0 |
| 1 | San Francisco Giants | 5098 | 4789 | 9893 | 0.515 | 8906 | 1950010.0 |
Using Polars
| name | Wins | Losses | Games | Win Rate | Homeruns | Average Attendance |
|---|---|---|---|---|---|---|
| str | i64 | i64 | i64 | f64 | i64 | f64 |
| "New York Giants" | 6033 | 4855 | 11033 | 0.547 | 5795 | 676358.0 |
| "San Francisco Giants" | 5098 | 4789 | 9893 | 0.515 | 8906 | 1.95001e6 |
Win rates of the Giants’ teams.
best_team = (
ggplot(data=res_team_pl)
+ geom_bar(mapping = aes(x = 'name', y = 'Win Rate', fill='name', color='name'), stat='identity')
+ guides(color="none")
+ labs(
title="Win ratio for Giants.",
subtitle="Shown for both Giant teams.",
x="Team",
y="Win Ratio",
fill='Team'
)
+ theme(
panel_background=element_rect(fill='gray'),
plot_background=element_rect(fill='gray'),
panel_grid_major=element_rect(fill='gray'),
legend_background=element_rect(fill='gray'),
axis_text=element_text(color='white'),
axis_title=element_text(color='white'),
plot_title=element_text(color='white'),
plot_subtitle=element_text(color='white'),
legend_text=element_text(color='white'),
legend_title=element_text(color='white')
)
)
best_team
# qt = '''
# SELECT COUNT() as "Count", name
# FROM teams
# GROUP BY name
# ORDER BY Count DESC, name
# '''
# # Using Pandas
# res_temp = pd.read_sql_query(qt, con)
# display(res_temp)
# # Using Polars
# print(f"Using Polars")
# res_temp_pl = pl.read_database(qt, con)
# display(res_temp_pl)Homerun’s of the Giants’ teams.
home_run_team = (
ggplot(data=res_team_pl)
+ geom_bar(mapping = aes(x = 'name', y = 'Homeruns', fill='name', color='name'), stat='identity')
+ guides(color="none")
+ labs(
title="Homeruns for Giants.",
subtitle="Shown for both Giant teams.",
x="Team",
y="Homeruns",
fill='Team'
)
+ theme(
panel_background=element_rect(fill='gray'),
plot_background=element_rect(fill='gray'),
panel_grid_major=element_rect(fill='gray'),
legend_background=element_rect(fill='gray'),
axis_text=element_text(color='white'),
axis_title=element_text(color='white'),
plot_title=element_text(color='white'),
plot_subtitle=element_text(color='white'),
legend_text=element_text(color='white'),
legend_title=element_text(color='white')
)
)
home_run_team